Objective: Summary of the demographics and soil characteristics of the Rwanda long term soil health study.

Study methodology

Add in details and links on study methodology here.

library(knitr)
library(ggplot2)
library(stringr)
suppressMessages(library(dplyr))
library(sp)
suppressMessages(library(dismo))
suppressMessages(library(stargazer))
library(leaflet)
library(XML)
suppressMessages(library(maptools))
library(automap)
wd <- "/Users/mlowes/drive/soil health study/data/rw baseline"
dd <- paste(wd, "data", sep = "/")
od <- paste(wd, "output", sep="/")
md <- paste(wd, "maps", sep="/")
drive <- "~/drive/r_help/4_output/statistical_test_outputs"

#load data:
# This data is being drawn from the Soil lab repository. It has the baseline data with it
d <- read.csv(paste(dd, "Rwanda_shs_commcare_soil data_final.csv", sep="/"),  stringsAsFactors=FALSE)

I’m using the merged data from Emmanuel for now but I will need to go back and download the demographic data fresh from Commcare and replicate the merge process using “Identifiers with SSN_final” provided by Emmanuel.

Included code to pair demographic and soil data?: Not yet

Cleaning baseline variables

Now let’s start cleaning the demographic variables

# take out weird CommCare stuff
d[d=="---"] <- NA

# take out demographic and text_final_question from variable names
#to.change <- c("text_final_questions.", "intro_champ_echantillon.",
#   "demographic_info.", "other_inputs_", "crop1_15b_inputs.", "crop2_15b_inputs.",
#   "^15b.", "historical_intro.")

#names(d) <- tapply(to.change, function(x) { gsub(x, "", names(d))})

names(d) <- gsub("text_fil_questions", "", names(d))
names(d) <- gsub("intro_champ_echantillon", "", names(d))
names(d) <- gsub("demographic_info", "", names(d))
names(d) <- gsub("other_inputs_", "", names(d))
names(d) <- gsub("crop1_15b_inputs", "", names(d))
names(d) <- gsub("crop2_15b_inputs", "", names(d))
names(d) <- gsub("^15b", "", names(d))
names(d) <- gsub("historical_intro", "", names(d))

names(d)[names(d)=="field_dim"] <- "field_dim1"
names(d)[names(d)=="v51"] <- "field_dim2"

Take care of demographic data formatting issues

# deal with names and drop unnecessary variables
d <- d %>% 
    dplyr::select(-c(rownumber, infoformid, introductiond_accept, photo,
        infocompleted_time, 
        enumerator_me, contains("phone"), farmer_me, farmersurme, farmerme,
        d_respondent, additiolsamplepackedandsenttol, additiolsamplerequestedfromlab,
        datedryingcompleteifnecessary, driedindistrictifnecessary, senttohqyo,
        collectedindistrictyo, excessstoredathq_, receivedathq_,dateofinitialdryingifnecessary,
        samplecollectedinfieldyo, field_des, samplewetordry)) %>%
    rename(
    female = sex,
    age = age_cultivateur,
    own = d_own,
    client = d_client) %>%
    mutate(
    female= ifelse(female=="gore", 1,0),
    field.size = field_dim1*field_dim2
    )

d$total.seasons <- apply(d[, grep("d_season_list", names(d))], 1, function(x) {
    sum(x, na.rm=T)})

Fix some more variable names:

names(d)[names(d)=="field_kg_fert1_1"] <- "field_kg_fert1_15b"
names(d)[names(d)=="field_kg_fert2_1"] <- "field_kg_fert2_15b"
names(d)[names(d)=="field_kg_compost"] <- "field_kg_compost_15b"

Recode variables to numeric:

# recode to numeric
varlist <- c("client", "own", "crop1_15b_seedkg", "crop1_15b_yield", "crop1_15b_yield_",
    "crop2_15b_seedkg", "crop2_15b_yield", "crop2_15b_yield_", "field_kg_fert1_15b",
    "field_kg_fert2_15b", "field_kg_compost_15b", "d_lime_15b", "kg_lime_15b")

# check that there aren't values hidden in the character variables
#apply(d[,varlist], 2, function(x){table(x, useNA='ifany')})

# recode characters to numerics
d[, varlist] <- sapply(d[,varlist], as.numeric)

# divide out GPS coordinates
# http://rfunction.com/archives/1499

# replace the blank gps_pic_guide with info
d <- cbind(d, str_split_fixed(d$gps_pic_guid, " ", n=4))
names(d)[106:109] <- c("lat", "lon", "alt", "precision")
d[,c("lat", "lon", "alt", "precision")] <- sapply(d[,c("lat", "lon", "alt", "precision")],
                                                  function(x){as.numeric(as.character(x))})

Check out missing data in the district variable:

#table(d$district, useNA = 'ifany')
length(d[d$district=="",])
## [1] 109
# these are the sample for which we have soil data but not survey data drop these for now
d <- d[-which(d$district==""),]

Cleaning soil data

Cleaning of soil data: Come back, check and clean the soil data before outputting to clean data set. Plot each of the soil variables to look for unrealistic values.

dim(d[is.na(d$m3.Ca),])
## [1]  11 109
d <- d[-which(is.na(d$m3.Ca)),]
       
summary(d[,c("m3.Ca", "m3.Mg", "pH", "Total.N", "Total.C")])
##      m3.Ca            m3.Mg               pH           Total.N       
##  Min.   : 220.5   Min.   :  38.26   Min.   :4.547   Min.   :0.05523  
##  1st Qu.: 443.0   1st Qu.: 114.16   1st Qu.:5.021   1st Qu.:0.13596  
##  Median : 720.2   Median : 184.98   Median :5.539   Median :0.15552  
##  Mean   : 876.4   Mean   : 207.82   Mean   :5.544   Mean   :0.15717  
##  3rd Qu.:1157.9   3rd Qu.: 270.26   3rd Qu.:6.009   3rd Qu.:0.17886  
##  Max.   :3669.2   Max.   :1008.93   Max.   :7.211   Max.   :0.24701  
##     Total.C      
##  Min.   :0.8988  
##  1st Qu.:1.7227  
##  Median :2.1118  
##  Mean   :2.1096  
##  3rd Qu.:2.3657  
##  Max.   :4.1620

Let’s check out the rows for which we don’t have soil data and drop them as they won’t contribute to the full picture.

Graphs of RW baseline soil variables

soilVars <- c("m3.Ca", "m3.Mg", "pH", "Total.N", "Total.C")
for(i in 1:length(soilVars)){
print(
  ggplot(data=d, aes(x=as.factor(client), y=d[,soilVars[i]])) + 
    geom_boxplot() +
    labs(x="Tubura Farmer", y=soilVars[i], title = paste("RW baseline soil - ", soilVars[i], sep = ""))
  )  
}

Check out soil relationships

There are biologically predictable relationships between soil chemical characteristics. For instance, we expect Ca and Mg to move in the same direction and be positively correlated with pH. If we had Aluminum as an outcome, we’d expect pH to be negatively correlated with soluable aluminum. Let’s look quickly to confirm if those relationships are present:

ggplot(d, aes(x=m3.Ca, y=m3.Mg)) + geom_point() +
    labs(x = "Calcium", y= "Magnesium", title="Calcium/Magnesium relationship")

ggplot(d, aes(x=pH, y=m3.Ca)) + geom_point() +
  labs(x = "pH", y="Calcium", title = "pH and Calcium relationship")

ggplot(d, aes(x=pH, y=m3.Mg)) + geom_point() +
  labs(x = "pH", y="Calcium", title = "pH and Magnesium relationship")

ggplot(d, aes(x=Total.C, y=Total.N)) + geom_point() + 
  labs(x = "Total Carbon", y="Total Nitrogen", title = "Carbon and Nitrogen relationship")

The soil characteristics are moving in the manner that is consistent with our understanding of soil chemical processes.

Save clean demographic and soil data to external file

write.csv(d, file=paste(paste(wd, "data", sep = "/"), "shs rw baseline.Rdata", sep = "/"))
save(d, file=paste(paste(wd, "data", sep = "/"), "shs rw baseline.Rdata", sep = "/"))

Map of baseline observations

Produce a simple map of where our observations are

e <- d[!is.na(d$lon),]
ss <- SpatialPointsDataFrame(coords = e[, c("lon", "lat")], data=e)
map <- leaflet() %>% addTiles() %>%
  setView(lng=rwanda$longitude, lat=rwanda$latitude, zoom=8) %>%
  addCircleMarkers(lng=ss$lon, lat=ss$lat, clusterOptions = markerClusterOptions())

map

Summary statistics

Table of final baseline breakdown

count <- d %>% group_by(district) %>% 
  dplyr::summarize(
    t.count = sum(ifelse(client==1,1,0)),
    c.count = sum(ifelse(client==0,1,0)),
    total = n()
  ) %>% ungroup()

count <- as.data.frame(count)
write.csv(count, file=paste(od, "final rw sample breakdown.csv", sep="/"), row.names=F)
as.data.frame(count)
##        district t.count c.count total
## 1   Gatsibo_LWH      58      59   117
## 2  Gatsibo_NLWH      87      81   168
## 3      Gisagara      46      46    92
## 4          Huye     114     115   229
## 5       Karongi     163     159   322
## 6       Kayonza      55      58   113
## 7      Mugonero     131     129   260
## 8     Nyamagabe      56      56   112
## 9    Nyamasheke     147     146   293
## 10       Nyanza      46      46    92
## 11    Nyaruguru      46      46    92
## 12      Rutsiro     166     166   332
## 13      Rwamaga     110     111   221

Baseline balance

Let’s see how balanced our farmers are in terms of demographic variables. Tubura farmers were selected based on (list criteria) and control farmers in the same area tha fit the same criteria were also selected. No matching process has been performed to identify the control farmers that most closely resemble the Tubura farmers in the sample.

out.list <- c("female", "age", "hhsize", "own", "field.size",
              "n_season_fert", "n_season_compost", "n_season_lime", "n_season_fallow", "n_seasons_leg_1", "n_seasons_leg_2", "m3.Ca", "m3.Mg",
              "pH", "Total.C", "Total.N")

output <- do.call(rbind, lapply(out.list, function(x) {
  
  out <- t.test(d[,x] ~ d[,"client"], data=d)
  tab <- data.frame(out[[5]][[1]], out[[5]][[2]], out[3])
  tab[,1:2] <- round(tab[,1:2],3)
  names(tab) <- c(names(out[[5]]), "pvalue")
  return(tab)
}))

# use p.adjust with bonferroni correction
output$pvalue <- p.adjust(output$pvalue, method="fdr")
rownames(output) <- out.list
output <- output[order(output$pvalue),]
output$pvalue <- ifelse(output[, 3] < 0.001, "< 0.001", round(output[, 3], 3)) 


colnames(output) <- c("Non-Tubura", "Tubura Client", "p-value") 
print(kable(output))
Non-Tubura Tubura Client p-value
n_season_fert 0.833 3.260 < 0.001
female 0.649 0.493 < 0.001
age 48.111 44.002 < 0.001
hhsize 4.896 5.416 < 0.001
n_season_lime 0.132 0.228 < 0.001
own 0.956 0.932 0.024
n_season_compost 5.523 5.818 0.126
n_season_fallow 0.570 0.690 0.136
Total.N 0.158 0.156 0.138
m3.Ca 894.242 858.574 0.157
m3.Mg 211.917 203.756 0.157
field.size 602.096 667.258 0.214
pH 5.556 5.532 0.348
Total.C 2.120 2.099 0.348
n_seasons_leg_1 2.130 2.233 0.374
n_seasons_leg_2 3.352 5.398 0.453
#write table
write.csv(output, file=paste(od, "baseline balance.csv", sep="/"), row.names=T)

Overall balance interpretation

Demographic variables We are not well balanced along the main demographic variables we collected, sex, age and HH size. For the purposes of inference we can test some matching algorithms to improve the match between Tubura and control farmers.

Agricultural practice variables We are decently balanced along agricultural practice variables. Our course of action here is similiar to our options with the demographic variables.

Soil Variables We are balanced on the primary soil variables of interest betwen our Tubura farmers and the comparison farmers.

Baseline balance by district

dist.output <- do.call(rbind, lapply(split(d, d$district), function(x) {
  
  tab <- do.call(rbind, lapply(out.list, function(y) {
    
    out <- t.test(x[,y] ~ x[,"client"], data=x)
    tab <- data.frame(out[[5]][[1]], out[[5]][[2]], out[3])
    tab[,1:2] <- round(tab[,1:2],3)
    names(tab) <- c(names(out[[5]]), "pvalue")
    #tab[,3] <- p.adjust(tab[,3], method="holm")
    #tab[,3] <- ifelse(tab[,3] < 0.001, "< 0.001", round(tab[,3],3))
    #print(tab)
    return(tab)
  }))
  
  return(data.frame(district = unique(x$district), tab))
}))

rownames(dist.output) <- NULL
dist.output$variable <- rep(out.list,13)    

# order variables 
dist.output <- dist.output[, c(1, 5, 2:4)]
dist.output$pvalue <- p.adjust(dist.output$pvalue, method="fdr")
dist.output <- dist.output[order(dist.output$pvalue),]

dist.output$pvalue <- ifelse(dist.output$pvalue < 0.001, "< 0.001", round(dist.output$pvalue,3))
colnames(dist.output) <- c("District", "Varible", "Non-Tubura", "Tubura Client", "p-value") 
print(kable(dist.output))
District Varible Non-Tubura Tubura Client p-value
70 Karongi n_season_fert 0.447 3.785 < 0.001
102 Mugonero n_season_fert 0.597 4.313 < 0.001
182 Rutsiro n_season_fert 1.072 4.175 < 0.001
134 Nyamasheke n_season_fert 1.507 5.068 < 0.001
198 Rwamaga n_season_fert 1.054 2.645 < 0.001
54 Huye n_season_fert 0.322 2.246 < 0.001
22 Gatsibo_NLWH n_season_fert 0.296 1.517 < 0.001
118 Nyamagabe n_season_fert 1.554 4.000 < 0.001
136 Nyamasheke n_season_lime 0.041 0.327 < 0.001
86 Kayonza n_season_fert 0.776 2.073 < 0.001
194 Rwamaga age 52.072 44.245 0.001
131 Nyamasheke hhsize 4.685 5.701 0.003
65 Karongi female 0.667 0.466 0.004
163 Nyaruguru hhsize 4.717 6.217 0.004
45 Gisagara m3.Mg 241.171 190.036 0.006
50 Huye age 49.400 43.395 0.009
44 Gisagara m3.Ca 1203.823 895.928 0.01
145 Nyanza female 0.543 0.217 0.012
150 Nyanza n_season_fert 0.152 1.326 0.019
18 Gatsibo_NLWH age 46.210 39.897 0.019
17 Gatsibo_NLWH female 0.556 0.322 0.021
2 Gatsibo_LWH age 49.949 41.931 0.027
113 Nyamagabe female 0.750 0.482 0.03
81 Kayonza female 0.672 0.400 0.03
20 Gatsibo_NLWH own 1.000 0.908 0.031
104 Mugonero n_season_lime 0.023 0.176 0.031
193 Rwamaga female 0.739 0.555 0.031
1 Gatsibo_LWH female 0.492 0.241 0.035
179 Rutsiro hhsize 4.855 5.608 0.046
99 Mugonero hhsize 5.054 5.802 0.049
98 Mugonero age 49.140 44.290 0.067
161 Nyaruguru female 0.652 0.391 0.077
48 Gisagara Total.N 0.154 0.142 0.079
47 Gisagara Total.C 2.005 1.816 0.086
184 Rutsiro n_season_lime 0.096 0.325 0.086
135 Nyamasheke n_season_compost 5.521 6.544 0.089
59 Huye n_seasons_leg_2 5.278 3.667 0.096
138 Nyamasheke n_seasons_leg_1 1.658 2.327 0.101
166 Nyaruguru n_season_fert 1.065 2.130 0.104
35 Gisagara hhsize 4.326 5.348 0.113
6 Gatsibo_LWH n_season_fert 1.475 2.552 0.121
51 Huye hhsize 4.574 5.132 0.141
115 Nyamagabe hhsize 4.696 5.482 0.141
41 Gisagara n_season_fallow 0.087 0.500 0.184
43 Gisagara n_seasons_leg_2 3.283 2.370 0.184
148 Nyanza own 1.000 0.913 0.197
196 Rwamaga own 0.973 0.909 0.197
46 Gisagara pH 5.945 5.759 0.213
27 Gatsibo_NLWH n_seasons_leg_2 3.889 3.046 0.22
38 Gisagara n_season_fert 0.348 1.087 0.22
72 Karongi n_season_lime 0.013 0.098 0.225
155 Nyanza n_seasons_leg_2 3.478 2.130 0.225
88 Kayonza n_season_lime 0.707 0.527 0.226
201 Rwamaga n_season_fallow 0.712 1.218 0.226
114 Nyamagabe age 48.875 43.768 0.229
176 Nyaruguru Total.N 0.170 0.161 0.238
205 Rwamaga m3.Mg 293.168 273.526 0.298
146 Nyanza age 50.152 44.717 0.335
33 Gisagara female 0.609 0.435 0.339
66 Karongi age 47.384 44.681 0.339
95 Kayonza Total.C 2.556 2.400 0.343
82 Kayonza age 46.500 42.345 0.351
64 Huye Total.N 0.148 0.143 0.353
23 Gatsibo_NLWH n_season_compost 3.420 2.632 0.366
92 Kayonza m3.Ca 1863.601 1664.716 0.366
177 Rutsiro female 0.627 0.542 0.375
183 Rutsiro n_season_compost 7.602 8.163 0.396
190 Rutsiro pH 5.342 5.261 0.396
11 Gatsibo_LWH n_seasons_leg_2 2.898 1.983 0.399
49 Huye female 0.591 0.500 0.446
84 Kayonza own 0.897 0.964 0.446
85 Kayonza field.size 423.776 664.236 0.446
96 Kayonza Total.N 0.187 0.180 0.446
108 Mugonero m3.Ca 604.767 555.744 0.446
122 Nyamagabe n_seasons_leg_1 1.125 1.625 0.446
168 Nyaruguru n_season_lime 0.000 0.043 0.446
185 Rutsiro n_season_fallow 0.289 0.470 0.446
207 Rwamaga Total.C 2.232 2.169 0.446
24 Gatsibo_NLWH n_season_lime 0.025 0.069 0.455
158 Nyanza pH 5.999 6.105 0.46
164 Nyaruguru own 0.935 0.848 0.46
188 Rutsiro m3.Ca 685.223 629.522 0.46
203 Rwamaga n_seasons_leg_2 1.685 0.536 0.46
174 Nyaruguru pH 5.210 5.319 0.466
71 Karongi n_season_compost 6.553 7.074 0.472
130 Nyamasheke age 48.123 45.898 0.499
156 Nyanza m3.Ca 987.213 1083.103 0.499
68 Karongi own 0.981 0.957 0.499
58 Huye n_seasons_leg_1 1.043 1.386 0.522
173 Nyaruguru m3.Mg 138.204 151.996 0.522
110 Mugonero pH 5.242 5.181 0.531
157 Nyanza m3.Mg 210.773 230.037 0.531
67 Karongi hhsize 4.912 5.172 0.536
175 Nyaruguru Total.C 2.229 2.140 0.548
142 Nyamasheke pH 5.123 5.179 0.568
3 Gatsibo_LWH hhsize 5.763 5.362 0.616
42 Gisagara n_seasons_leg_1 2.870 2.304 0.616
56 Huye n_season_lime 0.113 0.035 0.616
204 Rwamaga m3.Ca 1376.064 1298.489 0.616
189 Rutsiro m3.Mg 174.684 163.413 0.616
165 Nyaruguru field.size 495.239 570.022 0.621
4 Gatsibo_LWH own 1.000 0.983 0.622
52 Huye own 0.991 0.974 0.622
93 Kayonza m3.Mg 357.168 336.861 0.622
100 Mugonero own 0.915 0.947 0.622
128 Nyamagabe Total.N 0.165 0.169 0.622
191 Rutsiro Total.C 2.298 2.245 0.622
97 Mugonero female 0.713 0.656 0.627
181 Rutsiro field.size 432.241 581.313 0.63
79 Karongi Total.C 1.832 1.877 0.636
8 Gatsibo_LWH n_season_lime 0.322 0.500 0.645
37 Gisagara field.size 559.261 493.957 0.645
94 Kayonza pH 6.203 6.129 0.645
139 Nyamasheke n_seasons_leg_2 3.575 24.796 0.645
5 Gatsibo_LWH field.size 916.441 776.345 0.659
107 Mugonero n_seasons_leg_2 4.031 3.511 0.659
127 Nyamagabe Total.C 2.266 2.336 0.659
187 Rutsiro n_seasons_leg_2 2.584 2.048 0.659
206 Rwamaga pH 5.873 5.817 0.659
101 Mugonero field.size 393.124 441.359 0.66
171 Nyaruguru n_seasons_leg_2 3.370 5.326 0.661
208 Rwamaga Total.N 0.178 0.175 0.661
25 Gatsibo_NLWH n_season_fallow 0.543 0.736 0.667
129 Nyamasheke female 0.692 0.646 0.683
57 Huye n_season_fallow 0.487 0.675 0.684
133 Nyamasheke field.size 580.130 787.527 0.684
147 Nyanza hhsize 4.891 5.261 0.684
192 Rutsiro Total.N 0.159 0.157 0.684
172 Nyaruguru m3.Ca 615.230 665.044 0.687
144 Nyamasheke Total.N 0.167 0.165 0.69
117 Nyamagabe field.size 282.643 310.929 0.7
34 Gisagara age 46.848 44.761 0.703
10 Gatsibo_LWH n_seasons_leg_1 4.678 4.345 0.715
69 Karongi field.size 446.088 490.788 0.715
109 Mugonero m3.Mg 169.468 159.707 0.715
178 Rutsiro age 44.765 43.542 0.715
77 Karongi m3.Mg 269.452 254.229 0.715
21 Gatsibo_NLWH field.size 1459.901 1306.069 0.739
55 Huye n_season_compost 4.878 5.211 0.739
12 Gatsibo_LWH m3.Ca 1139.996 1206.971 0.754
39 Gisagara n_season_compost 3.957 4.326 0.754
78 Karongi pH 5.477 5.441 0.754
80 Karongi Total.N 0.142 0.144 0.754
90 Kayonza n_seasons_leg_1 2.155 1.982 0.754
153 Nyanza n_season_fallow 0.870 1.130 0.754
180 Rutsiro own 0.934 0.916 0.754
199 Rwamaga n_season_compost 3.748 4.018 0.754
106 Mugonero n_seasons_leg_1 1.186 1.336 0.764
19 Gatsibo_NLWH hhsize 5.160 5.333 0.766
91 Kayonza n_seasons_leg_2 1.259 1.091 0.766
116 Nyamagabe own 0.982 0.964 0.766
169 Nyaruguru n_season_fallow 0.261 0.370 0.766
36 Gisagara own 0.870 0.826 0.767
9 Gatsibo_LWH n_season_fallow 0.288 0.397 0.769
119 Nyamagabe n_season_compost 7.518 7.857 0.769
120 Nyamagabe n_season_lime 0.393 0.500 0.78
61 Huye m3.Mg 190.397 186.200 0.789
141 Nyamasheke m3.Mg 147.629 153.726 0.789
53 Huye field.size 567.922 599.096 0.8
123 Nyamagabe n_seasons_leg_2 2.929 3.161 0.8
29 Gatsibo_NLWH m3.Mg 254.332 246.971 0.814
132 Nyamasheke own 0.945 0.932 0.814
140 Nyamasheke m3.Ca 590.765 613.771 0.814
167 Nyaruguru n_season_compost 6.043 5.696 0.814
149 Nyanza field.size 766.217 853.359 0.818
170 Nyaruguru n_seasons_leg_1 2.348 2.043 0.819
154 Nyanza n_seasons_leg_1 2.283 2.543 0.828
197 Rwamaga field.size 842.054 892.855 0.831
202 Rwamaga n_seasons_leg_1 1.568 1.682 0.831
126 Nyamagabe pH 5.022 5.002 0.836
200 Rwamaga n_season_lime 0.324 0.355 0.836
60 Huye m3.Ca 870.054 853.743 0.847
62 Huye pH 5.665 5.684 0.848
30 Gatsibo_NLWH pH 6.061 6.038 0.852
103 Mugonero n_season_compost 6.070 6.229 0.872
137 Nyamasheke n_season_fallow 0.664 0.599 0.872
76 Karongi m3.Ca 803.893 787.032 0.89
16 Gatsibo_LWH Total.N 0.146 0.148 0.916
13 Gatsibo_LWH m3.Mg 235.470 239.641 0.929
14 Gatsibo_LWH pH 6.119 6.138 0.929
15 Gatsibo_LWH Total.C 1.866 1.887 0.929
63 Huye Total.C 1.919 1.910 0.929
105 Mugonero n_season_fallow 0.915 0.863 0.929
121 Nyamagabe n_season_fallow 0.304 0.268 0.929
124 Nyamagabe m3.Ca 449.585 456.505 0.929
125 Nyamagabe m3.Mg 108.307 106.662 0.929
143 Nyamasheke Total.C 2.285 2.298 0.929
159 Nyanza Total.C 1.778 1.761 0.929
28 Gatsibo_NLWH m3.Ca 1246.188 1230.161 0.931
75 Karongi n_seasons_leg_2 3.956 3.816 0.931
195 Rwamaga hhsize 4.982 4.927 0.931
83 Kayonza hhsize 5.155 5.091 0.932
7 Gatsibo_LWH n_season_compost 5.780 5.672 0.943
112 Mugonero Total.N 0.153 0.154 0.943
89 Kayonza n_season_fallow 0.483 0.455 0.952
160 Nyanza Total.N 0.134 0.134 0.962
31 Gatsibo_NLWH Total.C 1.994 2.002 0.964
186 Rutsiro n_seasons_leg_1 3.452 3.416 0.964
151 Nyanza n_season_compost 3.326 3.261 0.966
32 Gatsibo_NLWH Total.N 0.158 0.157 0.977
26 Gatsibo_NLWH n_seasons_leg_1 1.556 1.540 0.978
73 Karongi n_season_fallow 0.843 0.834 0.978
74 Karongi n_seasons_leg_1 2.497 2.485 0.978
87 Kayonza n_season_compost 3.534 3.564 0.978
111 Mugonero Total.C 2.203 2.206 0.978
162 Nyaruguru age 48.304 48.457 0.978
40 Gisagara n_season_lime 0.022 0.022 1
152 Nyanza n_season_lime 0.000 0.000 NA

District balance interpretation

Demographic variables interpretation here.

Agricultural practice variables interpretation here

Soil Variables interpretation here

write.csv(dist.output, file=paste(od, "district balance.csv", sep="/"), row.names=T)

Baseline balance by Tubura tenure

Look at farmers by duration of tenure farming with Tubura. We want to understand, at least with an initial naive baseline sense, what is the cumulative effect of Tubura practices on soil health outcomes?

We will look only at current Tubura farmers and compare first year farmers to farmers with more experience with Tubura.

oafOnly <- d[which(d$client==0 & d$total.seasons>=1),]
for(i in 1:length(soilVars)){
print(
  ggplot(oafOnly, aes(x=as.factor(total.seasons), y=oafOnly[,soilVars[i]])) + 
    geom_boxplot() + 
    labs(x="Tubura Tenure", y=soilVars[i], title = paste("RW baseline soil by tenure - ", soilVars[i], sep = ""))
  )  
}

Tenure summaries

tenureSum <- aggregate(oafOnly[,out.list], by=list(oafOnly$total.seasons), function(x){
  round(mean(x, na.rm=T),2)
})
tenureSum <- as.data.frame(t(tenureSum))
colnames(tenureSum) <- c(paste(seq(1,11,1), " seas.", sep = ""), "13 seas.")
print(kable(tenureSum))
1 seas. 2 seas. 3 seas. 4 seas. 5 seas. 6 seas. 7 seas. 8 seas. 9 seas. 10 seas. 11 seas. 13 seas.
Group.1 1.00 2.00 3.00 4.00 5.00 6.00 7.00 8.00 9.00 10.00 11.00 13.00
female 0.70 0.61 0.67 0.65 0.64 0.58 0.75 0.33 0.33 0.75 0.67 0.00
age 48.77 46.43 46.46 48.96 42.73 43.83 42.12 52.33 36.00 48.50 42.00 52.00
hhsize 5.30 5.40 5.71 4.91 6.09 5.25 5.50 5.83 6.67 6.50 8.67 6.00
own 0.92 0.97 1.00 1.00 0.91 1.00 1.00 0.83 1.00 1.00 1.00 1.00
field.size 564.72 595.76 390.17 547.13 976.18 289.17 346.75 602.33 256.00 241.00 334.67 480.00
n_season_fert 1.03 1.27 3.17 1.65 1.82 2.83 2.38 5.33 2.67 3.75 0.33 10.00
n_season_compost 5.48 5.61 6.58 4.43 5.82 6.50 6.75 8.33 10.00 8.25 4.67 10.00
n_season_lime 0.27 0.17 0.25 0.09 0.18 0.33 0.00 0.17 0.00 0.00 0.00 0.00
n_season_fallow 0.53 0.74 0.58 1.00 0.45 0.50 0.88 0.00 0.00 0.00 1.33 0.00
n_seasons_leg_1 2.12 2.14 1.96 2.00 1.55 1.25 0.62 3.00 4.00 2.25 4.33 1.00
n_seasons_leg_2 3.13 2.85 4.50 2.57 6.27 2.33 5.88 3.50 3.33 5.25 0.00 9.00
m3.Ca 1127.45 946.96 1034.46 686.67 678.56 625.67 870.40 443.64 1153.25 483.06 746.88 632.20
m3.Mg 247.35 226.65 252.39 165.35 153.59 138.80 217.69 120.77 307.42 153.92 177.55 224.80
pH 5.72 5.56 5.70 5.20 5.34 5.34 5.64 5.10 5.87 5.18 5.50 5.54
Total.C 2.13 2.19 2.19 2.36 2.01 2.06 1.83 2.36 1.85 2.26 1.77 2.06
Total.N 0.16 0.16 0.16 0.17 0.15 0.15 0.14 0.16 0.15 0.15 0.13 0.16
oafOnly$tenured <- ifelse(oafOnly$total.seasons==1,0,1)

tenure <- do.call(rbind, lapply(out.list, function(x) {
  
  out <- t.test(oafOnly[,x] ~ oafOnly[,"tenured"], data=oafOnly)
  tab <- data.frame(out[[5]][[1]], out[[5]][[2]], out[3])
  tab[,1:2] <- round(tab[,1:2],3)
  names(tab) <- c(names(out[[5]]), "pvalue")
  return(tab)
}))

# use p.adjust with bonferroni correction
tenure$pvalue <- p.adjust(tenure$pvalue, method="fdr")
rownames(tenure) <- out.list
tenure <- tenure[order(tenure$pvalue),]
tenure$pvalue <- ifelse(tenure[, 3] < 0.001, "< 0.001", round(tenure[, 3], 3)) 


colnames(tenure) <- c("Non-Tubura", "Tubura Client", "p-value") 
print(kable(tenure))
Non-Tubura Tubura Client p-value
n_season_fert 1.032 1.944 < 0.001
m3.Ca 1127.451 863.314 0.003
pH 5.724 5.492 0.003
m3.Mg 247.352 208.608 0.033
n_season_lime 0.272 0.162 0.105
own 0.920 0.975 0.115
female 0.704 0.614 0.219
age 48.768 46.213 0.235
n_season_compost 5.480 5.914 0.56
hhsize 5.304 5.523 0.574
n_season_fallow 0.528 0.680 0.574
Total.C 2.127 2.170 0.602
n_seasons_leg_2 3.128 3.365 0.746
field.size 564.724 540.751 0.833
n_seasons_leg_1 2.120 2.036 0.833
Total.N 0.160 0.160 0.835

Tenured v. new balance interpretation

Tubura tenure is being defined here as having more than 1 year experience farming with Tubura.

Demographic variables We are well balanced along demographic variables.

Agricultural practice variables Not surprisingly, Tubura farmers have more cumulative years of fertilizer use than current non-Tubura farmers. While that difference is signficant, it is realistically only a single season of fertilizer use different.

Interestingly, non-Tubura farmers reported using more lime than current Tubura farmers. This

Soil Variables Soil pH, calcium and magnesium levels are lower for tenured Tubura farmers. This is consistent with the hypothesis that increaesd fertilizer use leads to an increaese in soil acidity.

Analysis of agronomic practices contributing to soil health outcomes

Here’s where we’ll look at the contribution of fertilizer, lime and cultivation practices on soil health outcomes. This analysis will be come richer as we gain longitudinal measures. I caution that we cannot treat these relationships as causal. The direction of causality is not clearly delineated in the data or the study design. However, we can identify meaningful connections between practices and outcomes through this analysis to generate new hypotheses for field testing.

I’m going to start with behaviors by sections and then move to a more comprehensive model including multiple practices. All models will include controls for site to account for local variation and field officer behavior.

Agronomic Behaviors

suppressMessages(library(stargazer))

list1 <- lapply(soilVars, function(x){
  mod <- lm(as.formula(paste("d[,x] ~",  "n_season_fert + n_season_compost + n_season_lime + n_season_fallow + as.factor(district)", sep="")), data=d)
  return(mod)
})

stargazer(list1, type="html", 
          title = "2016A Rwanda Soil Health Baseline - Naive Agronomic Practice Models",
          covariate.labels = c("Seasons of Fertilizer", "Seasons of Compost", 
                               "Seasons of Lime", "Seasons of Fallow"),
          dep.var.caption = "",
          dep.var.labels = "",
          column.labels = c(gsub("m3.","", soilVars)),
          notes = "Includes FE for district",
          omit=c("district"), out=paste(od, "rw_baseline_agprac.htm", sep="/"))
2016A Rwanda Soil Health Baseline - Naive Agronomic Practice Models
Ca Mg pH Total.N Total.C
(1) (2) (3) (4) (5)
Seasons of Fertilizer -2.639 0.192 0.006* -0.001*** -0.013***
(3.471) (0.855) (0.003) (0.0002) (0.004)
Seasons of Compost 3.629 0.048 0.004 0.0003* 0.003
(2.720) (0.670) (0.003) (0.0002) (0.003)
Seasons of Lime 23.072 3.831 0.010 0.002** 0.030*
(15.158) (3.734) (0.015) (0.001) (0.016)
Seasons of Fallow -31.367*** -8.037*** -0.042*** 0.001** 0.014**
(5.584) (1.376) (0.005) (0.0003) (0.006)
Constant 1,158.974*** 238.052*** 6.103*** 0.146*** 1.867***
(43.596) (10.740) (0.043) (0.003) (0.046)
Observations 2,443 2,443 2,443 2,443 2,443
R2 0.367 0.237 0.425 0.192 0.159
Adjusted R2 0.363 0.232 0.421 0.186 0.153
Residual Std. Error (df = 2426) 437.575 107.798 0.430 0.026 0.466
F Statistic (df = 16; 2426) 87.957*** 47.177*** 112.103*** 35.920*** 28.612***
Note: p<0.1; p<0.05; p<0.01
Includes FE for district

Interpretation The naive model suggests that when we include site level fixed effects, duration of agronomic practices don’t have a big effect on soil health outcomes. However, some of the practice intensity variables are not well distributed. Let’s take a look at a log transformation. I’m adding one to the variables as to not end up with lots of Inf values.

agPrac <- c(names(d[grep('n_season_', names(d))]))

for(i in 1:length(agPrac)){
  print(
    ggplot(d, aes(x=d[,agPrac[i]])) + geom_histogram(binwidth = 1) +
      labs(x = agPrac[i])
        )
}

# since these are all skewed, consider a log transform 
for(i in 1:length(agPrac)){
  print(
    ggplot(d, aes(x=log(d[,agPrac[i]]+1))) + geom_histogram(binwidth = 1) +
      labs(x = agPrac[i])
        )
}

d$logFert <- log(d$n_season_fert+1)
d$logCompost <- log(d$n_season_compost+1)
d$logLime <- log(d$n_season_lime+1)
d$logFallow <- log(d$n_season_fallow+1)

Let’s look at the log results:

logVars <- paste(names(d[grep("log", names(d))]), collapse=" + ")

list2 <- lapply(soilVars, function(x){
  mod <- lm(as.formula(paste("d[,x] ~",  logVars, "+ as.factor(district)", sep="")), data=d)
  return(mod)
})

stargazer(list2, type="html", 
          title = "2016A Rwanda Soil Health Baseline - Naive Agronomic Practice Models",
          covariate.labels = c("Seasons of Fertilizer (log)", "Seasons of Compost (log)", "Seasons of Lime (log)", "Seasons of Fallow (log)"),
          dep.var.caption = "",
          dep.var.labels = "",
          column.labels = c(gsub("m3.","", soilVars)),
          notes = "Includes FE for district",
          omit=c("district"), out=paste(od, "rw_baseline_agprac_log.htm", sep="/"))
2016A Rwanda Soil Health Baseline - Naive Agronomic Practice Models
Ca Mg pH Total.N Total.C
(1) (2) (3) (4) (5)
Seasons of Fertilizer (log) -12.831 -0.801 0.018 -0.003*** -0.048***
(12.150) (2.995) (0.012) (0.001) (0.013)
Seasons of Compost (log) 17.314 -0.115 0.015 0.002** 0.018
(12.513) (3.084) (0.012) (0.001) (0.013)
Seasons of Lime (log) 61.473* 9.814 0.024 0.004** 0.078**
(33.355) (8.222) (0.033) (0.002) (0.036)
Seasons of Fallow (log) -114.067*** -28.722*** -0.151*** 0.002** 0.038**
(16.284) (4.014) (0.016) (0.001) (0.017)
Constant 1,159.526*** 240.777*** 6.111*** 0.146*** 1.859***
(45.388) (11.188) (0.045) (0.003) (0.049)
Observations 2,443 2,443 2,443 2,443 2,443
R2 0.372 0.243 0.432 0.192 0.159
Adjusted R2 0.368 0.238 0.428 0.186 0.153
Residual Std. Error (df = 2426) 435.826 107.428 0.428 0.026 0.466
F Statistic (df = 16; 2426) 89.885*** 48.550*** 115.129*** 35.963*** 28.644***
Note: p<0.1; p<0.05; p<0.01
Includes FE for district

Interpretation: When we transform the variables to log, the data starts to tell a more coherent story.

  • Duration of fertilizer use negatively effects all soil health parameters
  • Curiously, duration of compost use does as well. However, this may be indicative of the plot being farmed repeatedly. We should look into this further
  • Duration of lime use improves soil health metrics across the board
  • The relationships of extended fallows to soil health parameters are also surprising. We see additional years of fallow harming Ca, Mg, and pH but having a positive effect on total carbon. Our data is not specific enough to say whether these fallows were improved fallows or bare fallows, a distinction which may explain the relationships we’re seeing.

Tenure with One Acre Fund

Let’s look first at a naive model of One Acre Fund tenure on soil health. Remember: these data are not longitudinal! These data are not longitudinal and reflect farmer selection into One Acre Fund. While these models will try to be both robust and parsimonious, we will inevitabily suffer omitted variable bias due to a lack of an instrument.

list3 <- lapply(soilVars, function(x){
  mod <- lm(as.formula(paste("d[,x] ~",  "total.seasons + as.factor(district)", sep="")), data=d)
  return(mod)
})

stargazer(list3, type="html", 
          title = "2016A Rwanda Soil Health Baseline - Naive Tenure Models",
          covariate.labels = c("OAF Tenure"),
          dep.var.caption = "",
          dep.var.labels = "",
          column.labels = c(gsub("m3.","", soilVars)),
          notes = "Includes FE for district",
          omit=c("district"), out=paste(od, "rw_baseline_tenure.htm", sep="/"))
2016A Rwanda Soil Health Baseline - Naive Tenure Models
Ca Mg pH Total.N Total.C
(1) (2) (3) (4) (5)
OAF Tenure 3.388 1.679** 0.011*** -0.001*** -0.010***
(2.945) (0.725) (0.003) (0.0002) (0.003)
Constant 1,166.566*** 234.251*** 6.108*** 0.149*** 1.896***
(41.145) (10.124) (0.041) (0.002) (0.044)
Observations 2,443 2,443 2,443 2,443 2,443
R2 0.357 0.227 0.411 0.189 0.156
Adjusted R2 0.354 0.223 0.407 0.184 0.151
Residual Std. Error (df = 2429) 440.658 108.424 0.435 0.026 0.467
F Statistic (df = 13; 2429) 103.913*** 55.019*** 130.144*** 43.489*** 34.414***
Note: p<0.1; p<0.05; p<0.01
Includes FE for district

Interpretation: The naive One Acre Fund tenure model suggest that across the board that additional years of 1AF practices have a negative effect on soil health parameters. Let’s combine 1AF tenure with the agronomic practices model above to build a more robust model:

list4 <- lapply(soilVars, function(x){
  mod <- lm(as.formula(paste("d[,x] ~",  logVars, "+ total.seasons + as.factor(district)", sep="")), data=d)
  return(mod)
})

stargazer(list4, type="html", 
          title = "2016A Rwanda Soil Health Baseline - Ag Practice and Tenure",
          covariate.labels = c("Seasons of Fertilizer (log)", "Seasons of Compost (log)", "Seasons of Lime (log)", "Seasons of Fallow (log)", "OAF Tenure"),
          dep.var.caption = "",
          dep.var.labels = "",
          column.labels = c(gsub("m3.","", soilVars)),
          notes = "Includes FE for district",
          omit=c("district"), out=paste(od, "rw_baseline_ag_tenure.htm", sep="/"))
2016A Rwanda Soil Health Baseline - Ag Practice and Tenure
Ca Mg pH Total.N Total.C
(1) (2) (3) (4) (5)
Seasons of Fertilizer (log) -25.873* -6.471* -0.006 -0.002** -0.033**
(14.577) (3.589) (0.014) (0.001) (0.016)
Seasons of Compost (log) 17.302 -0.121 0.015 0.002** 0.018
(12.508) (3.080) (0.012) (0.001) (0.013)
Seasons of Lime (log) 63.791* 10.822 0.028 0.004** 0.075**
(33.375) (8.217) (0.033) (0.002) (0.036)
Seasons of Fallow (log) -114.747*** -29.018*** -0.152*** 0.002** 0.039**
(16.284) (4.009) (0.016) (0.001) (0.017)
OAF Tenure 5.760 2.504*** 0.011*** -0.001*** -0.007*
(3.560) (0.876) (0.003) (0.0002) (0.004)
Constant 1,158.003*** 240.115*** 6.108*** 0.146*** 1.861***
(45.382) (11.174) (0.044) (0.003) (0.049)
Observations 2,443 2,443 2,443 2,443 2,443
R2 0.373 0.245 0.434 0.194 0.160
Adjusted R2 0.368 0.240 0.430 0.188 0.154
Residual Std. Error (df = 2425) 435.680 107.270 0.427 0.026 0.466
F Statistic (df = 17; 2425) 84.808*** 46.309*** 109.252*** 34.365*** 27.180***
Note: p<0.1; p<0.05; p<0.01
Includes FE for district

Interpretation: Including agronomic practices and 1AF tenure in the same model dampens the magnitude, but not the significance, of 1AF tenure on soil health outcomes.

Agronomic practices - 15B cultivation practices

Thus far we have looked at aggregated historical plot level practices and their effect on soil health. We also asked farmers about their cultivation practices on their plot in the previous season, 15B. We have more precise information for fertilizer, compost and liming practices for the 15B season.

# this should be kg of fertilizer used in this field. Compost is off the charts. Convert this to compost per sq meter
d$fert_15b_log <- ifelse(d$field_kg_fert1_15b<=20, 
                         log(d$field_kg_fert1_15b+1), NA)

d$field_dim <- d$field_dim1*d$field_dim2

d$compost_15b_sqm <- d$field_kg_compost_15b/d$field_dim
d$compost_15b_sqm_log <- log(d$compost_15b_sqm+1)


# more than 20 kg per are seems like too much.
suppressMessages(qplot(d$fert_15b_log, binwidth=1))
## Warning: Removed 1808 rows containing non-finite values (stat_bin).

suppressMessages(qplot(d$compost_15b_sqm_log, binwidth=1))
## Warning: Removed 143 rows containing non-finite values (stat_bin).

previousSeason <- paste("fert_15b_log", "compost_15b_sqm_log", "as.factor(district)", sep=" + ")

list5 <- lapply(soilVars, function(x){
  mod <- lm(as.formula(paste("d[,x] ~", previousSeason, sep="")), data=d)
  return(mod)
})
stargazer(list5, type="html", 
          title = "2016A Rwanda Soil Health Baseline - 15B practices",
          covariate.labels = c("Fertilizer Rate (log)", "Compost Rate kg/m^2 (log)"),
          dep.var.caption = "",
          dep.var.labels = "",
          column.labels = c(gsub("m3.","", soilVars)),
          notes = "Includes FE for district",
          omit=c("district"), out=paste(od, "rw_baseline_15b_ag.htm", sep="/"))
2016A Rwanda Soil Health Baseline - 15B practices
Ca Mg pH Total.N Total.C
(1) (2) (3) (4) (5)
Fertilizer Rate (log) -54.919** -16.618** -0.087*** 0.001 0.020
(27.711) (7.772) (0.031) (0.002) (0.034)
Compost Rate kg/m(log) 55.915 17.782 -0.016 0.001 0.032
(41.802) (11.724) (0.046) (0.003) (0.052)
Constant 1,362.562*** 267.001*** 6.357*** 0.144*** 1.789***
(98.218) (27.546) (0.109) (0.007) (0.122)
Observations 635 635 635 635 635
R2 0.447 0.255 0.380 0.196 0.146
Adjusted R2 0.434 0.238 0.366 0.178 0.127
Residual Std. Error (df = 620) 382.674 107.324 0.424 0.026 0.476
F Statistic (df = 14; 620) 35.734*** 15.131*** 27.146*** 10.782*** 7.564***
Note: p<0.1; p<0.05; p<0.01
Includes FE for district

Let’s look at farmer perceived fertility as a predictor of soil health. We’ll set ‘same fertility’ as the reference category.

d$fertility_qual <- as.factor(d$general_field_infocompare_fertil)
d <- within(d, fertility_qual <- relevel(fertility_qual, ref="same"))

list6 <- lapply(soilVars, function(x){
  mod <- lm(as.formula(paste("d[,x] ~", "+ fertility_qual + as.factor(district)", sep="")), data=d)
  return(mod)
})


stargazer(list6, type="html", 
          title = "2016A Rwanda Soil Health Baseline - 15B practices",
          covariate.labels = c("Farmer Opinion - Less Fertile",
                               "Farmer Opinion - More Fertile"),
          dep.var.caption = "",
          dep.var.labels = "",
          column.labels = c(gsub("m3.","", soilVars)),
          notes = "Reference category is same fertility as other fields. Includes FE for district",
          notes.align = "l",
          omit=c("district"), out=paste(od, "rw_baseline_fertility_qual.htm", sep="/"))
2016A Rwanda Soil Health Baseline - 15B practices
Ca Mg pH Total.N Total.C
(1) (2) (3) (4) (5)
Farmer Opinion - Less Fertile -106.273*** -28.777*** -0.144*** 0.005*** 0.079***
(22.428) (5.512) (0.022) (0.001) (0.024)
Farmer Opinion - More Fertile 62.661** 17.929*** 0.066*** -0.001 -0.005
(24.803) (6.096) (0.024) (0.001) (0.026)
Constant 1,189.851*** 241.941*** 6.153*** 0.146*** 1.859***
(40.933) (10.061) (0.040) (0.002) (0.044)
Observations 2,443 2,443 2,443 2,443 2,443
R2 0.367 0.240 0.422 0.187 0.156
Adjusted R2 0.363 0.236 0.419 0.182 0.151
Residual Std. Error (df = 2428) 437.503 107.531 0.431 0.026 0.467
F Statistic (df = 14; 2428) 100.470*** 54.903*** 126.749*** 39.872*** 32.032***
Note: p<0.1; p<0.05; p<0.01
Reference category is same fertility as other fields. Includes FE for district

Interpretation: Farmers understand their fields well. Their categorization of which field are more and less fertile corresponds to our quantified measures of soil health. The only features farmers don’t seem to get correct are nitrogen and carbon. The nitrogen and carbon levels are actually higher in the fields farmers called ‘low fertility’ relative to the fields deemed to be the ‘same fertility.’ Reminder: We need to remember that farmers are only evaluting one of their fields thus we are not able to account for the quality of the farmer in assessing his/her fields.

Measured Yields

We don’t have measured yield data for the Rwanda baseline. Skip this for now.

Distributions of soil health levels for different sub-groups

Visualizations

Soil health maps for soil health study areas

#crdref <- CRS('+proj=longlat +datum=WGS84')
crdref <- CRS('+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0')
e <- d[!is.na(d$lon),]
ss <- SpatialPointsDataFrame(coords = e[, c("lon", "lat")], data=e, proj4string = crdref)

rw <- getData("GADM", country='RW', level=3, path = "/Users/mlowes/drive/soil health study/data") # the higher the number, the higher the resolution

#ext.rw.ss matches points with spatial polygons in rw
ext.rw.ss <- extract(rw[, "NAME_3"], ss)
## Loading required namespace: rgeos
ss$spatialname <- ext.rw.ss$NAME_3

frw <- fortify(rw, region="NAME_3")
ss.soil <- aggregate(ss@data[,soilVars], by=list(ss@data$spatialname), function(x){
  mean(x, na.rm=T)
})

plotReady <- dplyr::left_join(frw, ss.soil, by=c("id"="Group.1"))

Plot simple summary of soil characteristics

Consider revising these maps to a smaller greographic unit. Add the name of the location for uninitiated users.

library(RColorBrewer)
mapList <- list()
for(i in 1:length(soilVars)){
mapRes <-  ggplot(plotReady, aes(x=long, y=lat, group=group)) + geom_path() + 
  geom_polygon(aes(fill=plotReady[,soilVars[i]])) + 
  scale_fill_gradientn(colours = rev(brewer.pal(9,"Reds")), # define colors
                       name = soilVars[i],
                       guide = guide_colorbar(legend.direction = "vertical")) + 
  theme_bw() + 
  labs(title=paste("Rwanda long term soil health baseline - 2016", soilVars[i], sep= " "), x = "Longitude", y="Latitude")
mapList[[i]] <- mapRes
print(mapRes)
}  

pdf(file=paste(md, "rw_shs_baseline_soil_maps.pdf", sep = "/"), width=11, height=8.5)
for(i in 1:length(mapList)){
print(mapList[[i]])
}  
dev.off()
## quartz_off_screen 
##                 2

Interpolations of soil health values

Interpolate soil health values for full operating area using soil health study values. We want to eventually add all Rwandan soil values into a single dataset to update and hone these values. See here for more guidanace

note:

  • Layer on the Tubura sites to the interpolated map.
  • Use leaflet so you can zoom in and out more easily.
  • Make it a raster layer that you can click on understand the values at different locations and the name of the location. I’ll need to make this a shiny app to have that functionality

The code below will run 5 K-fold cross validation to compare interpolation models. The output will be fed into the interpolate leaflet code below.

Check that I’m handling the projection correctly with Robert

#proj4string(ss) <- CRS("+init=epsg:4326")
ss <- spTransform(ss, CRS=("+proj=utm +zone=36N +datum=WGS84"))
suppressMessages(library(fields))
library(gstat)
library(htmltools)

# root mean sq error for evaluating models
RMSE <- function(observed, predicted) {
  sqrt(mean((predicted - observed)^2, na.rm=TRUE))
}

# set k folds to 5
set.seed(20161030)
nfolds <- 5
k <- kfold(ss, nfolds) # from dismo

# cross validation of models
ensrmse <- tpsrmse <- idwrmse <- rep(NA, 5) # assing multiple objects at once

cv <- function(x) {
  
  for(i in 1:nfolds) {
    train <- ss[k!=i,]
    test <- ss[k==i,]

  train <- train[!is.na(train@data[,x]),]
  
  # inverse distance weights
  m <- gstat(formula=as.formula(paste(x, '~ 1')), locations=train)
  p1 <- predict(m, newdata=test, debug.level=0)$var1.pred
  idwrmse[i] <-  RMSE(test@data[,x], p1) #idw rsme
  
  # krieging
  # m <- autoKrige(formula=as.formula(paste(x, "~ 1")), input_data = train)
  # p2 <- predict(m, newdata=test, debug.level=0)$var1.pred
  # krigrmse[i] <-  RMSE(test$OZDLYAV, p2)
  
  # thin plate spline
  m <- Tps(coordinates(train), train@data[,x]) 
  p3 <- predict(m, coordinates(test))
  tpsrmse[i] <-  RMSE(test@data[,x], p3)
  
  w <- c(idwrmse[i], tpsrmse[i]) # combine the rmse
  weights <- w / sum(w) # weight them
  ensemble <- p1 * weights[1] + p3 * weights[2] 
  # multiply predictions by weights
  ensrmse[i] <-  RMSE(test@data[,x], ensemble) # truly an ensemble result?
  }
  
  output <- rbind(idwrmse, tpsrmse, ensrmse)
  return(output)
  
}

Now loop over the variables of interest where x is the soilVar variable.

output <- lapply(soilVars, function(x){
  ini <- data.frame(cv(x))
  ini$ave <- apply(ini[,1:5], 1, function(y){mean(y, na.rm=T)})
  res <- paste("Best model is ", row.names(ini[which.min(ini$ave),]),  sep = "")
  return(list(ini, res))
})

Interpolated soil maps

r <- raster(res=1/12)
r <- crop(r, floor(extent(rw)))

maps <- lapply(soilVars, function(x){
  m <- Tps(coordinates(ss), ss@data[,x])
  # make raster layer with model, raster is rwanda empty raster, model is m
  tps <- interpolate(r, m)
  tps <- crop(tps, rw)
  tps <- mask(tps, rw) # cuts the tps raster down to the rw boundaries
  x <- gsub("m3.", "", x)
  
  pal <- colorNumeric(palette = "Reds", values(tps), na.color = "transparent")
  
  suppressWarnings(
  leaflet() %>% addTiles() %>%
  addRasterImage(tps, colors=pal, opacity = 0.8) %>%
  setView(lng=rwanda$longitude, lat=rwanda$latitude, zoom=8) %>%
  addLegend(pal = pal, values = values(tps), title = paste("Soil Value ", x, sep=""))
  )
})

tagList(maps)

Print out the interpolated values for inclusion in the report.

## NULL
## NULL
## NULL
## NULL
## NULL
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## quartz_off_screen 
##                 2

Geographic Descriptors

This section will look at improving understanding of local context to inform local adaptation. It will also construct soil profiles to simplify the scaling of promising products and practices to targeted locations.

–end